Back to Analysis Index

Load libraries

library(limma)
library(Glimma)
library(tidyverse)
library(DESeq2)
library(org.Hs.eg.db)
library(pheatmap)
library(EnhancedVolcano)
library(ComplexHeatmap)
library(gplots)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(DOSE)
library(gridExtra)
library(grid)

Motivation

Set up enivronment

Preferences

set.seed(42)

Load counts

# These counts were generated on the HPC using RSubRead FeatureCounts package
load("source_data/FeatCounts_Rsub_run35.RData")
counts_run35 <- counts$counts
load("source_data/FeatCounts_Rsub_run36.RData")
counts_run36 <- counts$counts
head(counts_run35[1:4,1:4])
##           MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## 100287102         10          15          12          21
## 653635           498         271         417         423
## 102466751          4           7           8           9
## 100302278          1           0           0           0

Run 35

counts <- counts_run35

Coldata

coldata <- tibble::tribble(
  ~SeqID, ~healthy_donor, ~condition,
  "MERNA1", "D1", "IL2",
  "MERNA2", "D1", "IL15",
  "MERNA3", "D1", "IL21",
  "MERNA4", "D1", "IL7",
  "MERNA5", "D1", "UST",
  "MERNA6", "D2", "IL2",
  "MERNA7", "D2", "IL15",
  "MERNA8", "D2", "IL21",
  "MERNA9", "D2", "IL7",
  "MERNA10", "D2", "UST",
  "MERNA11", "D3", "IL2",
  "MERNA12", "D3", "IL15",
  "MERNA13", "D3", "IL21",
  "MERNA14", "D3", "IL7",
  "MERNA15", "D3", "UST"
)

Rename gene list

gene.list <- data.frame(entrez.id = rownames(counts))
gene.list <- gene.list %>%
  dplyr::mutate(
    SYMBOL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "SYMBOL", keytype = "ENTREZID"),
    GENETYPE = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "GENETYPE", keytype = "ENTREZID"),
    ENSEMBL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "ENSEMBL", keytype = "ENTREZID")
  )

# Rename to SYMBOL
filtered.gene.list <- gene.list %>%
  dplyr::filter(
    !is.na(SYMBOL),
    !duplicated(SYMBOL))

# Rename gene IDs from ensembl to symbol
counts <- counts[rownames(counts) %in% filtered.gene.list$entrez.id, ]
rownames(counts) <- filtered.gene.list$SYMBOL[match(rownames(counts), filtered.gene.list$entrez.id)]
counts[1:4,1:4]
##           MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## DDX11L1           10          15          12          21
## WASH7P           498         271         417         423
## MIR6859-1          4           7           8           9
## MIR1302-2          1           0           0           0
# Remove the ".bam" extension from colnames of counts
colnames(counts) <- sub(".bam", "", colnames(counts))

# Reorder columns of counts to match the order in coldata$SeqID
counts_run35 <- counts[, match(coldata$SeqID, colnames(counts))]

saveRDS(counts_run35, "counts_run35.rds")

Generate DESeq2 object

dds <- DESeqDataSetFromMatrix(counts_run35, coldata, design = ~healthy_donor + condition)
dds <- estimateSizeFactors(dds)
idx <- rowSums(counts(dds, normalized=TRUE) >= 10 ) >= 3
dds <- dds[idx,]
dds <- DESeq(dds)
vst <- varianceStabilizingTransformation(dds)
vsn::meanSdPlot(assay(vst))

plotPCA(vst, intgroup = c("healthy_donor"))

plotPCA(vst, intgroup = c("condition"))

sizeFactors(dds)
##    MERNA1    MERNA2    MERNA3    MERNA4    MERNA5    MERNA6    MERNA7    MERNA8 
## 1.8058775 1.4788977 1.2524641 0.8612628 0.7966731 0.7554115 1.4088596 0.8897406 
##    MERNA9   MERNA10   MERNA11   MERNA12   MERNA13   MERNA14   MERNA15 
## 0.9966245 0.6587678 0.9831123 0.6706531 1.3473446 0.9526336 0.9431699
normalized_counts <- counts(dds, normalized=TRUE)

DESeq2 results - Generalised linear model (Wald test)

res.il15 <- results(dds, contrast = c("condition","IL21","IL15"))
res.sig.il15 <- subset(res.il15, padj < 0.05)
lfc.il15 <- res.sig.il15[ order(res.sig.il15$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il15)
## 
## out of 600 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 122, 20%
## LFC < 0 (down)     : 478, 80%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il15
## log2 fold change (MLE): condition IL21 vs IL15 
## Wald test p-value: condition IL21 vs IL15 
## DataFrame with 600 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CXCL13     86.02403       11.66732   2.55013   4.57518 4.75800e-06 7.75444e-04
## COL6A3    401.77920        7.12769   1.29528   5.50281 3.73786e-08 3.63717e-05
## RDH8       57.46700        7.04912   1.84572   3.81917 1.33902e-04 7.09086e-03
## GREM1      21.04118        6.90119   1.88518   3.66076 2.51472e-04 1.12739e-02
## NCAN        6.16191        6.34385   1.80852   3.50776 4.51896e-04 1.77666e-02
## ...             ...            ...       ...       ...         ...         ...
## RGS8        25.0830       -7.08206   1.58708  -4.46231 8.10807e-06 0.001048143
## OLAH        52.6620       -7.13134   1.38420  -5.15197 2.57758e-07 0.000121453
## SOCS2-AS1   23.8220       -7.18129   1.64787  -4.35792 1.31304e-05 0.001481357
## ENPEP       84.1598       -8.93951   1.78546  -5.00685 5.53280e-07 0.000205096
## HMCN1      117.7320       -9.42187   1.68347  -5.59669 2.18482e-08 0.000027624
res.il2 <- results(dds,contrast = c("condition","IL21","IL2"))
res.sig.il2 <- subset(res.il2, padj < 0.05)
lfc.il2 <- res.sig.il2[ order(res.sig.il2$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il2)
## 
## out of 1779 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 609, 34%
## LFC < 0 (down)     : 1170, 66%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il2
## log2 fold change (MLE): condition IL21 vs IL2 
## Wald test p-value: condition IL21 vs IL2 
## DataFrame with 1779 rows and 6 columns
##           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CXCL13    86.02403       10.05458   2.46592   4.07742 4.55374e-05 1.35042e-03
## IGKV1-9   31.92469        7.22644   1.27779   5.65541 1.55474e-08 1.62022e-06
## IGHV3-74   6.05461        6.73644   1.80041   3.74162 1.82836e-04 4.17046e-03
## IGHV3-48   6.46942        6.73370   1.95990   3.43573 5.90966e-04 1.03495e-02
## COL6A3   401.77920        6.65152   1.29346   5.14242 2.71224e-07 1.91072e-05
## ...            ...            ...       ...       ...         ...         ...
## RGS8       25.0830       -7.82936   1.58029  -4.95439 7.25569e-07 4.31160e-05
## OLAH       52.6620       -7.99767   1.37946  -5.79768 6.72371e-09 8.11483e-07
## DNAJC22    50.1076       -9.07540   1.52727  -5.94224 2.81154e-09 3.91458e-07
## ENPEP      84.1598       -9.88347   1.78332  -5.54216 2.98756e-08 2.76282e-06
## HMCN1     117.7320      -10.32902   1.68161  -6.14235 8.13102e-10 1.34672e-07
lfc.il15.df <- lfc.il15 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
lfc.il2.df <- lfc.il2 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')

Volcano plot

# Set fold change and p-value cutoffs
fold_cutoff <- 2.5
pvalue_cutoff <- 0.05

label_fold_cutoff <- 7
label_pvalue_cutoff <- 0.05

# Filter data based on cutoffs
res.il2.labels <- subset(res.il2, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il15.labels <- subset(res.il15, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))

res.il2.labels <- unique(paste0(c(rownames(res.il2.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
res.il15.labels <- unique(paste0(c(rownames(res.il15.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))

filtered_data_intersect <- intersect(res.il2.labels, res.il15.labels)

# Create the first EnhancedVolcano plot for IL21 versus IL2
plot1_run35 <- EnhancedVolcano(res.il2,
    lab = rownames(res.il2),
    selectLab = filtered_data_intersect,
    drawConnectors = TRUE,
    x = 'log2FoldChange',
    y = 'padj',
    title = 'IL21 versus IL2',
    pCutoff = pvalue_cutoff,
    FCcutoff = fold_cutoff,
    pointSize = 3.0,
    boxedLabels = TRUE,
    labSize = 6.0)

# Create the second EnhancedVolcano plot for IL21 versus IL15
plot2_run35 <- EnhancedVolcano(res.il15,
    lab = rownames(res.il15),
    selectLab = filtered_data_intersect,
    drawConnectors = TRUE,
    x = 'log2FoldChange',
    y = 'padj',
    title = 'IL21 versus IL15',
    pCutoff = pvalue_cutoff,
    FCcutoff = fold_cutoff,
    pointSize = 3.0,
    boxedLabels = TRUE,
    labSize = 6.0)

# Arrange the two plots side by side using gridExtra
grid.arrange(plot1_run35, plot2_run35, ncol = 2)

Run 36

counts <- counts_run36

Coldata

coldata <- tibble::tribble(
  ~SeqID, ~healthy_donor, ~condition,
  "MERNA1", "D1", "IL2",
  "MERNA2", "D1", "IL15",
  "MERNA3", "D1", "IL21",
  "MERNA4", "D1", "IL7",
  "MERNA5", "D1", "UST",
  "MERNA6", "D2", "IL2",
  "MERNA7", "D2", "IL15",
  "MERNA8", "D2", "IL21",
  "MERNA9", "D2", "IL7",
  "MERNA10", "D2", "UST",
  "MERNA11", "D3", "IL2",
  "MERNA12", "D3", "IL15",
  "MERNA13", "D3", "IL21",
  "MERNA14", "D3", "IL7",
  "MERNA15", "D3", "UST"
)

Rename gene list

gene.list <- data.frame(entrez.id = rownames(counts))
gene.list <- gene.list %>%
  dplyr::mutate(
    SYMBOL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "SYMBOL", keytype = "ENTREZID"),
    GENETYPE = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "GENETYPE", keytype = "ENTREZID"),
    ENSEMBL = mapIds(org.Hs.eg.db, keys = gene.list$entrez.id, column = "ENSEMBL", keytype = "ENTREZID")
  )

# Rename to SYMBOL
filtered.gene.list <- gene.list %>%
  dplyr::filter(
    !is.na(SYMBOL),
    !duplicated(SYMBOL))

# Rename gene IDs from ensembl to symbol
counts <- counts[rownames(counts) %in% filtered.gene.list$entrez.id, ]
rownames(counts) <- filtered.gene.list$SYMBOL[match(rownames(counts), filtered.gene.list$entrez.id)]
counts[1:4,1:4]
##           MERNA1.bam MERNA10.bam MERNA11.bam MERNA12.bam
## DDX11L1            2          20          15          37
## WASH7P            85         487         406         980
## MIR6859-1          0          10           7          33
## MIR1302-2          0           0           0           0
# Remove the ".bam" extension from colnames of counts
colnames(counts) <- sub(".bam", "", colnames(counts))

# Reorder columns of counts to match the order in coldata$SeqID
counts_run36 <- counts[, match(coldata$SeqID, colnames(counts))]

saveRDS(counts_run36, "counts_run36.rds")

Generate DESeq2 object

dds <- DESeqDataSetFromMatrix(counts_run36, coldata, design = ~healthy_donor + condition)
dds <- estimateSizeFactors(dds)
idx <- rowSums(counts(dds, normalized=TRUE) >= 10 ) >= 3
dds <- dds[idx,]
dds <- DESeq(dds)
vst <- varianceStabilizingTransformation(dds)
vsn::meanSdPlot(assay(vst))

plotPCA(vst, intgroup = c("healthy_donor"))

plotPCA(vst, intgroup = c("condition"))

sizeFactors(dds)
##    MERNA1    MERNA2    MERNA3    MERNA4    MERNA5    MERNA6    MERNA7    MERNA8 
## 0.3633083 0.2570585 1.1260936 1.7044318 1.5821346 1.4238734 0.6168305 1.5043697 
##    MERNA9   MERNA10   MERNA11   MERNA12   MERNA13   MERNA14   MERNA15 
## 1.2135284 1.3913264 1.1655990 1.7515148 0.6391892 1.0577737 1.3042779
normalized_counts <- counts(dds, normalized=TRUE)

DESeq2 results - Generalised linear model (Wald test)

res.il15 <- results(dds, contrast = c("condition","IL21","IL15"))
res.sig.il15 <- subset(res.il15, padj < 0.05)
lfc.il15 <- res.sig.il15[ order(res.sig.il15$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il15)
## 
## out of 514 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 88, 17%
## LFC < 0 (down)     : 426, 83%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 31)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il15
## log2 fold change (MLE): condition IL21 vs IL15 
## Wald test p-value: condition IL21 vs IL15 
## DataFrame with 514 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##         <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CXCL13    70.6592        8.50599  2.022667   4.20533 2.60697e-05 2.01833e-03
## COL6A3   342.2097        6.69035  1.283910   5.21092 1.87910e-07 9.21659e-05
## TNFRSF8  370.2915        4.51090  1.334518   3.38017 7.24409e-04 2.34372e-02
## BCAT1   1154.2222        4.36151  0.970938   4.49206 7.05370e-06 1.00123e-03
## JCHAIN   113.2250        4.09748  0.872142   4.69818 2.62495e-06 5.23175e-04
## ...           ...            ...       ...       ...         ...         ...
## IL13      52.1665       -5.83101   1.42019  -4.10580 4.02916e-05 0.002685085
## DNAJC22   42.5352       -6.09647   1.51633  -4.02056 5.80610e-05 0.003613931
## OLAH      41.8109       -7.55617   1.53253  -4.93051 8.20136e-07 0.000245281
## HMCN1     96.9493       -7.86271   1.54524  -5.08834 3.61212e-07 0.000138412
## ENPEP     65.9906       -7.91695   1.90355  -4.15905 3.19569e-05 0.002346438
res.il2 <- results(dds,contrast = c("condition","IL21","IL2"))
res.sig.il2 <- subset(res.il2, padj < 0.05)
lfc.il2 <- res.sig.il2[ order(res.sig.il2$log2FoldChange, decreasing=TRUE), ]
summary(lfc.il2)
## 
## out of 1648 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 567, 34%
## LFC < 0 (down)     : 1081, 66%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lfc.il2
## log2 fold change (MLE): condition IL21 vs IL2 
## Wald test p-value: condition IL21 vs IL2 
## DataFrame with 1648 rows and 6 columns
##           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CXCL13    70.65920        9.54598   2.07407   4.60254 4.17371e-06 1.94743e-04
## COL6A3   342.20974        6.76980   1.27357   5.31559 1.06310e-07 8.58810e-06
## IGKV2-24   3.55377        6.09640   1.97368   3.08886 2.00928e-03 2.59230e-02
## IGHV3-74   5.27194        5.92522   1.92669   3.07534 2.10266e-03 2.67645e-02
## FCRL5     27.30764        5.61344   1.29536   4.33349 1.46763e-05 5.61754e-04
## ...            ...            ...       ...       ...         ...         ...
## HSPA4L     24.7930       -7.90183   1.41828  -5.57142 2.52666e-08 2.44796e-06
## OLAH       41.8109       -8.40829   1.52690  -5.50676 3.65508e-08 3.41089e-06
## DNAJC22    42.5352       -8.69111   1.48548  -5.85071 4.89475e-09 6.15322e-07
## HMCN1      96.9493       -8.78465   1.54264  -5.69454 1.23704e-08 1.36352e-06
## ENPEP      65.9906       -9.47420   1.89768  -4.99251 5.95995e-07 3.78337e-05
lfc.il15.df <- lfc.il15 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')
lfc.il2.df <- lfc.il2 %>% as.data.frame() %>% rownames_to_column(.,var = 'GENE_ID')

Volcano plot

# Set fold change and p-value cutoffs
fold_cutoff <- 2.5
pvalue_cutoff <- 0.05

label_fold_cutoff <- 7
label_pvalue_cutoff <- 0.05

# Filter data based on cutoffs
res.il2.labels <- subset(res.il2, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))
res.il15.labels <- subset(res.il15, abs(log2FoldChange) > label_fold_cutoff & -log10(padj) > -log10(label_pvalue_cutoff))

res.il2.labels <- unique(paste0(c(rownames(res.il2.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))
res.il15.labels <- unique(paste0(c(rownames(res.il15.labels), "CXCL13", "IGFBP4", "TNFRSF8", "OSM", "IFNG", "DUSP6", "LTA", "SOCS2", "CSF2", "FCRL2")))

filtered_data_intersect <- intersect(res.il2.labels, res.il15.labels)

# Create the first EnhancedVolcano plot for IL21 versus IL2
plot1_run36 <- EnhancedVolcano(res.il2,
    lab = rownames(res.il2),
    selectLab = filtered_data_intersect,
    drawConnectors = TRUE,
    x = 'log2FoldChange',
    y = 'padj',
    title = 'IL21 versus IL2',
    pCutoff = pvalue_cutoff,
    FCcutoff = fold_cutoff,
    pointSize = 3.0,
    boxedLabels = TRUE,
    labSize = 6.0)

# Create the second EnhancedVolcano plot for IL21 versus IL15
plot2_run36 <- EnhancedVolcano(res.il15,
    lab = rownames(res.il15),
    selectLab = filtered_data_intersect,
    drawConnectors = TRUE,
    x = 'log2FoldChange',
    y = 'padj',
    title = 'IL21 versus IL15',
    pCutoff = pvalue_cutoff,
    FCcutoff = fold_cutoff,
    pointSize = 3.0,
    boxedLabels = TRUE,
    labSize = 6.0)

# Arrange the two plots side by side using gridExtra
grid.arrange(plot1_run36, plot2_run36, ncol = 2)

Show all volcanoes

# Create titles for the different runs
title_run35 <- textGrob("Run 35", gp = gpar(fontsize = 40, fontface = "bold"))
title_run36 <- textGrob("Run 36", gp = gpar(fontsize = 40, fontface = "bold"))

# Arrange the plots in a grid layout
grid.arrange(
    title_run35, nullGrob(), plot1_run35, plot2_run35,
    title_run36, nullGrob(), plot1_run36, plot2_run36,
    ncol = 2,
    layout_matrix = rbind(c(1, 2),
                          c(3, 4),
                          c(5, 6),
                          c(7, 8)),
    heights = c(0.1, 1, 0.1, 1)
)

Sequencing saturation

The following plots show the estimated sequencing saturation for the two sequencing runs conducted for this project. The code and detailed outs are found in this directory. The code is not run in this directory because the sampling is computationally taxing; so I have ran that independently of the main markdown and just showing the final plots here.

Estimated sequencing saturation for sequencing run 35
Estimated sequencing saturation for sequencing run 35
Estimated sequencing saturation for sequencing run 36
Estimated sequencing saturation for sequencing run 36

Conclusion

The sequencing runs are called 35 and 36 because this is the numbers given by the sequencing hub, and have just used this code as shorthand for describing the separate sequencing runs. Go to the HPC analysis script for the long sequencing name.

The plots for run 35 and run 36 are very similar. They both reach random sampling sequencing saturation. The technical batch effect will be greater than any extra information by combining the sequencing runs. Therefore, the final analysis will just contain sequencing run 35.

The final analysis for the 2024 paper is the same as is shown in the ‘run 35’ chunks - with IL-7 removed as this was not necessary for the paper.